[Autogluon] (tripartite) The F1 score comes out far too low... (failed)

Author

김보람

Published

January 24, 2024

import

import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
import torch  # needed by the edge_index helpers defined below

# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

    
def throw(df, fraud_rate):  # downsample normal transactions until the target fraud rate is reached
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=42)
    df_p = pd.concat([df1, df0_down])
    return df_p

def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3):
    n = len(data_frame)

    # separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # size of the test set and how many of its rows should be fraud
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # sample test rows from the fraud and normal transactions
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False)

    # combine into the test set
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # remaining rows form the training set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data

def concat(df_tr, df_tst):
    df = pd.concat([df_tr, df_tst])
    # boolean masks marking which rows came from train vs. test (keeps the indices straight)
    train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))
    test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True)))
    mask = (train_mask, test_mask)
    return df, mask

def evaluation(y, yhat):
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__: [m(y, yhat).round(6)] for m in metrics})

def compute_time_difference(group):
    # pairwise |Δt| in seconds between every pair of transactions in the group
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs((group.iloc[i].trans_date_trans_time - group.iloc[j].trans_date_trans_time).total_seconds())
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result

def edge_index_save(df, unique_col, theta, gamma):
    groups = df.groupby(unique_col)
    edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
    edge_index = edge_index.astype(np.float64)

    # number the saved file so repeated runs do not overwrite earlier attempts
    save_attempt = 0
    filename = f"edge_index_attempt{save_attempt}_{str(unique_col).replace(' ', '').replace('_', '')}.npy"
    while os.path.exists(filename):
        save_attempt += 1
        filename = f"edge_index_attempt{save_attempt}_{str(unique_col).replace(' ', '').replace('_', '')}.npy"
    np.save(filename, edge_index)

    # weight each pair by exp(-Δt/θ) and keep only pairs whose weight exceeds γ
    edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * np.exp(-edge_index[:, 2] / theta)
    edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
    return edge_index

def edge_index(df, unique_col, theta, gamma):
    # same as edge_index_save, but without writing the intermediate array to disk
    groups = df.groupby(unique_col)
    edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
    edge_index = edge_index.astype(np.float64)

    edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * np.exp(-edge_index[:, 2] / theta)
    edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
    return edge_index
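
As a quick check that the resampling helpers behave as intended, here is a minimal sketch on a small synthetic frame (hypothetical data, not fraudTrain.csv): throw should land on the requested overall fraud rate and split_dataframe on the requested test fraud rate, which matches the 0.34 / 0.20 rates printed for the real data below.

# sanity-check sketch on synthetic data (not fraudTrain.csv)
toy = pd.DataFrame({
    "is_fraud": [1] * 30 + [0] * 970,
    "amt": np.random.default_rng(0).uniform(1, 100, 1000),
})
toy_down = throw(toy, 0.3)                        # 30 frauds + 70 sampled normals
print(round(toy_down.is_fraud.mean(), 2))         # 0.3
toy_tr, toy_tst = split_dataframe(toy_down, 0.2)
print(round(toy_tr.is_fraud.mean(), 2),           # ~0.34 (train keeps the leftover frauds)
      round(toy_tst.is_fraud.mean(), 2))          # 0.2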
df = pd.read_csv("~/Desktop/fraudTrain.csv")

Tripartite graph

def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # one node per transaction (row index), per card (cc_num) and per merchant
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # each transaction node is linked to its card node and its merchant node
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)

    # edge labels carry is_fraud, edge weights carry the transaction amount
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
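
To make the tripartite layout concrete, here is a minimal sketch on two hand-made transactions (hypothetical values): every row becomes a transaction node wired to its card (cc_num) node and its merchant node, so two rows give 5 nodes and 4 edges carrying the is_fraud label and the amount as weight. A fresh nx.Graph() is passed explicitly so the shared default-argument graph is not reused.

toy_tx = pd.DataFrame({
    "cc_num":   [1111, 1111],
    "merchant": ["shop_A", "shop_B"],
    "amt":      [10.0, 25.0],
    "is_fraud": [0, 1],
})
G_toy = build_graph_tripartite(toy_tx, nx.Graph())
# 1 card node + 2 merchant nodes + 2 transaction nodes = 5 nodes,
# and each transaction contributes a card-tx and a tx-merchant edge = 4 edges
print(G_toy.number_of_nodes(), G_toy.number_of_edges())   # 5 4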
    

Supervised learning

0.3 / 0.2 (fraud rate / test fraud rate)

df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.2)
df_tr.shape
(14014, 23)
df_tst.shape
(6006, 23)
df_tr.is_fraud.mean()
0.34287141429998574
df_tst.is_fraud.mean()
0.19996669996669997
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
G_down.number_of_edges(), G_down.number_of_nodes()
(40040, 21656)
edges = G_down.edges
sg.StellarGraph(edges=edges, edge_type_column="label")
from sklearn.model_selection import train_test_split


train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
np.array(train_labels).mean(), np.array(test_labels).mean()
(0.2995442057942058, 0.30182317682317683)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
graph = sg.StellarGraph(G_down)
/tmp/ipykernel_3324862/2906379029.py:1: DeprecationWarning: Constructing a StellarGraph directly from a NetworkX graph has been replaced by the `StellarGraph.from_networkx` function
  graph = sg.StellarGraph(G_down)
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
# note: each iteration overwrites embeddings_train, so only the last embedder
# (WeightedL2Embedder) actually feeds the edge features below

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
np.array(train_embeddings).shape
(32032, 128)
np.array(train_embeddings)
array([[8.79964884e-03, 1.64067835e-01, 7.30881765e-02, ...,
        1.81038718e-04, 1.96826290e-02, 6.05850630e-02],
       [3.18445265e-04, 9.41526145e-02, 2.41454929e-01, ...,
        5.79439476e-02, 9.22752500e-01, 2.49633682e-03],
       [1.09851332e-02, 4.12013801e-03, 1.61135435e-01, ...,
        1.24859456e-02, 2.32662242e-02, 1.07970215e-01],
       ...,
       [6.13867212e-03, 8.51532519e-02, 7.86146245e-07, ...,
        6.33678436e-02, 7.13208392e-02, 2.39914820e-01],
       [2.59715295e-03, 1.24797074e-03, 1.26128927e-01, ...,
        7.45704547e-02, 2.37582775e-04, 2.18050033e-01],
       [3.34992632e-02, 2.74142623e-03, 4.02400009e-02, ...,
        2.08475683e-02, 4.62760888e-02, 3.03567946e-01]], dtype=float32)
np.array(train_labels).shape
(32032,)

# DataFrame 생성
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# DataFrame 합치기
df = pd.concat([df_data, df_labels], axis=1)
df
X_0 X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 ... X_119 X_120 X_121 X_122 X_123 X_124 X_125 X_126 X_127 label
0 8.799649e-03 0.164068 7.308818e-02 0.007493 0.000196 0.037998 0.026417 0.030260 0.045050 0.047880 ... 0.001140 0.082948 0.018060 0.041995 0.073492 0.011187 1.810387e-04 0.019683 0.060585 1
1 3.184453e-04 0.094153 2.414549e-01 0.016275 0.078201 0.015268 0.000326 0.331411 0.067376 0.061926 ... 0.223335 0.000269 0.041893 0.051856 0.191570 0.000165 5.794395e-02 0.922752 0.002496 0
2 1.098513e-02 0.004120 1.611354e-01 0.548673 0.025099 0.001323 0.005411 0.005461 0.021607 0.094634 ... 0.070138 0.038877 0.014588 0.003246 0.205133 0.064290 1.248595e-02 0.023266 0.107970 0
3 5.125533e-04 0.069008 2.394260e-02 0.038580 0.000345 0.058848 0.005284 0.168507 0.327026 0.002053 ... 0.001018 0.037014 0.015680 0.193011 0.000812 0.001481 4.044086e-02 0.065931 0.131638 0
4 2.551593e-01 0.011703 4.802953e-02 0.020095 0.177149 0.022541 0.196618 0.041765 0.000156 0.158181 ... 0.233692 0.104250 0.091681 0.002522 0.874027 0.003707 3.469664e-05 0.015489 0.005535 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32027 1.361575e+00 0.023050 1.305244e-01 0.504093 0.014287 0.312939 0.000587 0.015945 0.049380 0.332058 ... 0.002336 0.000683 0.191192 0.356860 0.227310 0.000661 1.307885e-01 0.023592 0.041412 0
32028 9.765987e-07 0.037045 1.420536e-02 0.026103 0.000017 0.116524 0.065952 0.054716 0.154811 0.011176 ... 0.130505 0.006745 0.039672 0.052033 0.125356 0.015103 2.027648e-08 0.016023 0.169818 1
32029 6.138672e-03 0.085153 7.861462e-07 0.099862 0.408811 0.021244 0.006153 0.115799 0.000933 0.317840 ... 0.177379 0.565970 0.022457 0.043790 0.059339 0.257367 6.336784e-02 0.071321 0.239915 0
32030 2.597153e-03 0.001248 1.261289e-01 0.000002 0.001143 0.248826 0.006462 0.016343 0.015120 0.108578 ... 0.000021 0.000859 0.000894 0.496311 0.065650 0.007386 7.457045e-02 0.000238 0.218050 0
32031 3.349926e-02 0.002741 4.024000e-02 0.003076 0.000032 0.000168 0.065571 0.012371 0.112705 0.023244 ... 0.138493 0.010161 0.032122 0.063049 0.028025 0.001975 2.084757e-02 0.046276 0.303568 1

32032 rows × 129 columns

label = np.array(train_labels)
predictr = TabularPredictor(label='label')
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_110704/"
predictr.fit(df) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_110704/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   623.47 GB / 982.82 GB (63.4%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    25264.96 MB
    Train Data (Original)  Memory Usage: 16.4 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.4s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.39s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd7032fe50>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
    0.5904   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.18s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.5904   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.08s    = Validation runtime
Fitting model: LightGBMXT ...
    0.726    = Validation score   (accuracy)
    1.07s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.7248   = Validation score   (accuracy)
    0.94s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.7236   = Validation score   (accuracy)
    6.19s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.7248   = Validation score   (accuracy)
    8.16s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.7268   = Validation score   (accuracy)
    1.01s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.7288   = Validation score   (accuracy)
    1.02s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.726    = Validation score   (accuracy)
    1.06s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
    0.742    = Validation score   (accuracy)
    14.83s   = Training   runtime
    0.03s    = Validation runtime
Fitting model: XGBoost ...
    0.7244   = Validation score   (accuracy)
    1.27s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.7432   = Validation score   (accuracy)
    11.94s   = Training   runtime
    0.11s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.7268   = Validation score   (accuracy)
    1.99s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.7532   = Validation score   (accuracy)
    0.8s     = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 52.58s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_110704/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd0b85c640>
test = np.array(test_embeddings)
test.shape
(8008, 128)
columns = [f'X_{i}' for i in range(test.shape[1])]

# DataFrame 생성
test_df = pd.DataFrame(data=test, columns=columns)

# DataFrame 확인
print(test_df.head())
        X_0       X_1       X_2       X_3       X_4       X_5       X_6  \
0  0.413663  2.075492  0.097850  0.431519  0.026412  0.046006  0.139835   
1  0.012225  0.000547  0.024713  0.703247  0.020913  0.419119  0.352671   
2  0.195774  0.000752  0.009002  0.204248  0.070720  0.906959  0.507191   
3  0.260361  0.329479  0.229454  0.023667  0.001113  0.002806  0.034591   
4  0.427026  0.024197  0.695513  0.057896  0.270117  0.026265  0.320806   

        X_7       X_8       X_9  ...     X_118     X_119     X_120     X_121  \
0  0.014063  0.473687  0.014283  ...  0.048837  0.010120  0.005523  0.961210   
1  1.669539  0.126232  0.027797  ...  0.728475  0.378981  1.377197  0.203648   
2  0.267592  0.115088  0.409537  ...  0.295417  0.440117  0.088578  0.060138   
3  0.045546  0.623125  0.282554  ...  0.174775  0.159482  0.001048  0.035379   
4  0.029051  0.263759  0.028128  ...  0.008239  0.063498  0.810952  0.013463   

      X_122     X_123     X_124     X_125     X_126     X_127  
0  0.026908  0.000516  0.054398  0.188361  0.007897  0.495728  
1  0.086590  0.230660  0.000614  1.211448  0.513626  0.000002  
2  0.004575  1.479674  0.131626  0.928811  0.315709  0.000010  
3  0.640979  0.329148  0.003226  0.278314  0.002369  0.150809  
4  0.295149  0.006146  0.608962  0.076195  0.106696  0.122096  

[5 rows x 128 columns]
predictr.predict(test_df).mean()
0.0241008991008991
y = np.array(test_labels)
yhat = predictr.predict(test_df)
# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

    
evaluation(y,yhat)
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.690684         0.264249      0.021375   0.03955       0.498058
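
One note before the remaining runs: the AutoGluon log above says it optimized validation accuracy, and with roughly 70% normal edges a model can look fine on accuracy while flagging almost no fraud, which is exactly the near-zero recall here. A hedged sketch (not something run in this post) of asking AutoGluon to optimize F1 instead:

predictr_f1 = TabularPredictor(label='label', eval_metric='f1')   # optimize F1 instead of accuracy
predictr_f1.fit(df)
yhat_f1 = predictr_f1.predict(test_df)
evaluation(y, yhat_f1)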

0.3 / 0.3

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.3)

df_tr, df_tst = split_dataframe(df, 0.3)

df_, mask = concat(df_tr, df_tst)

G_down = build_graph_tripartite(df_)

train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)

edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))



node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# DataFrame 생성
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# DataFrame 합치기
df = pd.concat([df_data, df_labels], axis=1)


label = np.array(train_labels)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:46<00:00,  4.61s/it]
predictr = TabularPredictor(label='label')
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_112803/"
predictr.fit(df) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_112803/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   622.65 GB / 982.82 GB (63.4%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24167.01 MB
    Train Data (Original)  Memory Usage: 16.4 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.4s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.38s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.594    = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.46s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.5956   = Validation score   (accuracy)
    0.09s    = Training   runtime
    0.49s    = Validation runtime
Fitting model: LightGBMXT ...
    0.7392   = Validation score   (accuracy)
    1.03s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    0.74     = Validation score   (accuracy)
    1.08s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.7304   = Validation score   (accuracy)
    6.74s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.7268   = Validation score   (accuracy)
    8.77s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: CatBoost ...
    0.7396   = Validation score   (accuracy)
    1.06s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.7328   = Validation score   (accuracy)
    1.12s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.7312   = Validation score   (accuracy)
    1.18s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
    0.758    = Validation score   (accuracy)
    14.33s   = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.7376   = Validation score   (accuracy)
    1.16s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.7464   = Validation score   (accuracy)
    10.13s   = Training   runtime
    0.09s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.738    = Validation score   (accuracy)
    2.1s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.7708   = Validation score   (accuracy)
    0.8s     = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 52.42s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_112803/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd8f0e0190>
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]

# DataFrame 생성
test_df = pd.DataFrame(data=test, columns=columns)
y = np.array(test_labels)

yhat = predictr.predict(test_df)
evaluation(y,yhat)
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0         0.69493              0.0           0.0       0.0            0.5
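
The 0.3 / 0.3 run predicts no positives at all. Another knob worth trying, assuming the installed AutoGluon 0.8.2 supports the sample_weight='balance_weight' option, is to re-weight the minority class during training; again only a sketch, not something that was run here:

predictr_bal = TabularPredictor(label='label', eval_metric='f1',
                                sample_weight='balance_weight')   # assumed available in this version
predictr_bal.fit(df)
evaluation(y, predictr_bal.predict(test_df))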

0.3 / 0.4

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# DataFrame 생성
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# DataFrame 생성
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:48<00:00,  4.86s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_113911/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_113911/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   621.83 GB / 982.82 GB (63.3%)
Train Data Rows:    32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    23867.75 MB
    Train Data (Original)  Memory Usage: 16.4 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.3s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.36s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5988   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.4s     = Validation runtime
Fitting model: KNeighborsDist ...
    0.5992   = Validation score   (accuracy)
    0.08s    = Training   runtime
    0.47s    = Validation runtime
Fitting model: LightGBMXT ...
    0.738    = Validation score   (accuracy)
    0.99s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    0.738    = Validation score   (accuracy)
    1.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.7324   = Validation score   (accuracy)
    6.23s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.7292   = Validation score   (accuracy)
    8.22s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.7364   = Validation score   (accuracy)
    1.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.7372   = Validation score   (accuracy)
    1.07s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.7324   = Validation score   (accuracy)
    1.11s    = Training   runtime
    0.05s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
    0.756    = Validation score   (accuracy)
    15.57s   = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.7352   = Validation score   (accuracy)
    1.1s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.7552   = Validation score   (accuracy)
    10.67s   = Training   runtime
    0.09s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.738    = Validation score   (accuracy)
    2.21s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.7688   = Validation score   (accuracy)
    0.84s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 52.62s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_113911/")
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.688686         0.350318      0.045852  0.081091       0.504741
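
The predictor also exposes fraud probabilities, so the 0.5 cut-off implied by predict() is not the only option. The sketch below sweeps a threshold over predict_proba() and reports the best F1; it is applied directly to the test embeddings purely as a diagnostic (choosing the threshold on the test set leaks information, so a validation split should be used for a real choice).

proba_fraud = predictr.predict_proba(test_df)[1]             # probability of class 1 (fraud)
thresholds = np.linspace(0.05, 0.95, 19)
f1s = [f1_score(y, (proba_fraud >= t).astype(int)) for t in thresholds]
best = int(np.argmax(f1s))
print(thresholds[best], f1s[best])                           # threshold with the highest F1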

0.4 / 0.4

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.4)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# DataFrame 생성
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# DataFrame 생성
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00,  3.23s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002023/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002023/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   620.00 GB / 982.82 GB (63.1%)
Train Data Rows:    24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24334.27 MB
    Train Data (Original)  Memory Usage: 12.3 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.7s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 12.3 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.7s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 21621, Val Rows: 2403
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5568   = Validation score   (accuracy)
    0.06s    = Training   runtime
    0.38s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.5576   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.45s    = Validation runtime
Fitting model: LightGBMXT ...
    0.6746   = Validation score   (accuracy)
    1.38s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.6767   = Validation score   (accuracy)
    1.73s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.6479   = Validation score   (accuracy)
    4.39s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.6583   = Validation score   (accuracy)
    6.06s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.6804   = Validation score   (accuracy)
    6.64s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.6558   = Validation score   (accuracy)
    0.8s     = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.6533   = Validation score   (accuracy)
    0.86s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 7: early stopping
    0.7012   = Validation score   (accuracy)
    11.36s   = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.6866   = Validation score   (accuracy)
    2.02s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.707    = Validation score   (accuracy)
    7.38s    = Training   runtime
    0.09s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.6758   = Validation score   (accuracy)
    3.62s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.7179   = Validation score   (accuracy)
    0.82s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 49.84s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_002023/")
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.599234              0.0           0.0       0.0       0.499584

0.4 / 0.3

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.4)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# DataFrame 생성
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# DataFrame 생성
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00,  3.28s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002828/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002828/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   619.33 GB / 982.82 GB (63.0%)
Train Data Rows:    24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24266.65 MB
    Train Data (Original)  Memory Usage: 12.3 MB (0.1% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.4s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 12.3 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.42s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 21621, Val Rows: 2403
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5389   = Validation score   (accuracy)
    0.06s    = Training   runtime
    0.39s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.5397   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.45s    = Validation runtime
Fitting model: LightGBMXT ...
    0.6833   = Validation score   (accuracy)
    1.88s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.6758   = Validation score   (accuracy)
    1.16s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.6629   = Validation score   (accuracy)
    4.52s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.6633   = Validation score   (accuracy)
    6.25s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.6733   = Validation score   (accuracy)
    2.82s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.6604   = Validation score   (accuracy)
    0.81s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.6579   = Validation score   (accuracy)
    0.85s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
    0.6991   = Validation score   (accuracy)
    11.81s   = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.6779   = Validation score   (accuracy)
    2.21s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.6937   = Validation score   (accuracy)
    9.84s    = Training   runtime
    0.11s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.6821   = Validation score   (accuracy)
    4.86s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.7116   = Validation score   (accuracy)
    0.84s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 50.43s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_002828/")
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.594406         0.464115      0.040066  0.073764       0.504412
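
Note that the for cl in classes loop in each block overwrites embeddings_train on every pass, so only the last embedder (WeightedL2Embedder) actually produces the features that reach AutoGluon. A minimal sketch, reusing the model_train, edgs and train_edges names from above, of keeping one feature set per embedder so the four could be compared; the dictionary names are illustrative only:

embedders = {cl.__name__: cl(keyed_vectors=model_train.wv)
             for cl in [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]}
train_embeddings_by_embedder = {
    name: [emb[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]  # one embedding vector per train edge
    for name, emb in embedders.items()
}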

fraud rate 0.5 / test fraud rate 0.3

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# build the training DataFrame
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# build the test DataFrame
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004947/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004947/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   617.43 GB / 982.82 GB (62.8%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24477.85 MB
    Train Data (Original)  Memory Usage: 9.84 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.7s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.69s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5484   = Validation score   (accuracy)
    0.05s    = Training   runtime
    0.3s     = Validation runtime
Fitting model: KNeighborsDist ...
    0.5484   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.36s    = Validation runtime
Fitting model: LightGBMXT ...
    0.6556   = Validation score   (accuracy)
    1.15s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.6493   = Validation score   (accuracy)
    1.14s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.6379   = Validation score   (accuracy)
    3.71s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.6415   = Validation score   (accuracy)
    5.18s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.6545   = Validation score   (accuracy)
    2.32s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.6332   = Validation score   (accuracy)
    0.71s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.6498   = Validation score   (accuracy)
    0.73s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
    0.6597   = Validation score   (accuracy)
    9.97s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.629    = Validation score   (accuracy)
    1.53s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.6514   = Validation score   (accuracy)
    5.69s    = Training   runtime
    0.08s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.6472   = Validation score   (accuracy)
    3.91s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.6769   = Validation score   (accuracy)
    0.75s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 39.24s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_004947/")
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.508637         0.470588       0.00339  0.006731       0.499854
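
With recall collapsing toward zero, another lever is to predict class probabilities and lower the decision threshold instead of taking the default cutoff. A minimal sketch, assuming the fitted predictr and test_df from the block above; the positive-class column 1 and the 0.3 threshold are assumptions, not values used in this post:

proba = predictr.predict_proba(test_df)    # one probability column per class (0 and 1)
yhat_thr = (proba[1] > 0.3).astype(int)    # a lower cutoff trades precision for recall
evaluation(y, yhat_thr)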

fraud rate 0.5 / test fraud rate 0.4

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# build the training DataFrame
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# build the test DataFrame
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.57s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004313/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004313/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   617.99 GB / 982.82 GB (62.9%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24485.97 MB
    Train Data (Original)  Memory Usage: 9.84 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.7s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.68s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5375   = Validation score   (accuracy)
    0.05s    = Training   runtime
    0.3s     = Validation runtime
Fitting model: KNeighborsDist ...
    0.5375   = Validation score   (accuracy)
    0.05s    = Training   runtime
    0.38s    = Validation runtime
Fitting model: LightGBMXT ...
    0.6774   = Validation score   (accuracy)
    0.98s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.667    = Validation score   (accuracy)
    1.03s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    0.6592   = Validation score   (accuracy)
    3.68s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.6623   = Validation score   (accuracy)
    5.2s     = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.6727   = Validation score   (accuracy)
    3.06s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.6649   = Validation score   (accuracy)
    0.7s     = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.6686   = Validation score   (accuracy)
    0.74s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
    0.6649   = Validation score   (accuracy)
    10.27s   = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.6582   = Validation score   (accuracy)
    1.74s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.6727   = Validation score   (accuracy)
    5.93s    = Training   runtime
    0.08s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.6681   = Validation score   (accuracy)
    3.79s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.6915   = Validation score   (accuracy)
    0.74s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 40.32s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_004313/")
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.501977              0.0           0.0       0.0            0.5
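
The UndefinedMetricWarning above fires because no test sample is predicted as fraud, so precision has a zero denominator. A small sketch of the zero_division parameter the warning points to, applied to the same y and yhat as above:

from sklearn.metrics import precision_score, recall_score, f1_score

precision_score(y, yhat, zero_division=0)  # returns 0.0 silently when nothing is predicted positive
recall_score(y, yhat, zero_division=0)
f1_score(y, yhat, zero_division=0)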

fraud rate 0.5 / test fraud rate 0.2

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.5)
df_tr, df_tst = split_dataframe(df, 0.2)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))


node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]


# build the training DataFrame
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])

df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# build the test DataFrame
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

evaluation(y,yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00,  2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_005617/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_005617/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   616.86 GB / 982.82 GB (62.8%)
Train Data Rows:    19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0, 1]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    24455.01 MB
    Train Data (Original)  Memory Usage: 9.84 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.6s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.65s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.5182   = Validation score   (accuracy)
    0.05s    = Training   runtime
    0.3s     = Validation runtime
Fitting model: KNeighborsDist ...
    0.5193   = Validation score   (accuracy)
    0.07s    = Training   runtime
    0.38s    = Validation runtime
Fitting model: LightGBMXT ...
    0.6348   = Validation score   (accuracy)
    0.97s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.6348   = Validation score   (accuracy)
    1.37s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini ...
    0.6254   = Validation score   (accuracy)
    3.74s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.6337   = Validation score   (accuracy)
    5.22s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: CatBoost ...
    0.6462   = Validation score   (accuracy)
    3.83s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.6384   = Validation score   (accuracy)
    0.69s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.6446   = Validation score   (accuracy)
    0.74s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
    0.6426   = Validation score   (accuracy)
    9.51s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: XGBoost ...
    0.6301   = Validation score   (accuracy)
    1.54s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.6415   = Validation score   (accuracy)
    5.31s    = Training   runtime
    0.08s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.6379   = Validation score   (accuracy)
    2.72s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.6634   = Validation score   (accuracy)
    0.73s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 38.72s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_005617/")
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.510926              0.0           0.0       0.0       0.499796
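
Every setting on this page ends with precision, recall and F1 at or near zero, which points to the ensemble predicting almost nothing but the non-fraud class. A quick diagnostic sketch, assuming the last predictr, test_df and yhat above, to check that directly and to see the per-model validation scores:

print(pd.Series(yhat).value_counts())  # how often each class is actually predicted on the test edges
predictr.leaderboard(silent=True)      # validation score of every fitted model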